Bus Network Graph: Experiments on real data¶


In [ ]:
from helper import *
from network import *
from stop import Stop
from variant import Variant
from path import Path
from queries import StopQuery, VariantQuery, PathQuery

from ipyleaflet import Map, GeoJSON
import pandas as pd
import plotly.express as px
from tqdm import tqdm

import timeit

import plotly.io as pio
pio.renderers.default = "jupyterlab"
unavailable until proj_trans is called
In [ ]:
net_analysis = NetworkAnalysisBetweenness()
dijkstra = BusNetworkDijkstra()
net = BusNetwork.from_ndjsons(sides_set_type='spatial')
sides_set_type =  spatial
100%|██████████| 297/297 [00:07<00:00, 39.68it/s]
In [ ]:
stops = StopQuery.from_ndjson()
vars = VariantQuery.from_ndjson()

Degree distribution of Stops¶

In [ ]:
def mod_row(row):
    if row['StopType'] in [
        'Ô sơn',
        'Biển treo',
        'Trạm tạm',
        'Bến bãi QH chung QH',
        'Ga Metro Số 2',
        'Bến Bãi QH 568'
    ]:
        row['StopType'] = 'Others'

    return row

df_nodes = pd.DataFrame([
    stop.to_dict()
    for stop in net.nodes.values()
]).apply(mod_row, axis=1)

df_nodes['Degree'] = df_nodes['Routes'].apply(lambda s: len(s.split()))
df_nodes = df_nodes[['StopId', 'StopType', 'Degree']]
df_nodes
Out[ ]:
StopId StopType Degree
0 35 Bến xe 26
1 7276 Trụ dừng 21
2 7277 Trụ dừng 19
3 7278 Nhà chờ 14
4 7265 Nhà chờ 5
... ... ... ...
4392 7682 Others 1
4393 7683 Others 1
4394 7684 Others 1
4395 7685 Others 1
4396 7686 Others 1

4397 rows × 3 columns

In [ ]:
bins = [*range(15), 15, 50]
lens = df_nodes.groupby(pd.cut(df_nodes['Degree'], bins=bins)).apply(len)

bins[-1] = '15+'
fig = px.bar(y=bins[1:], x=lens, height=800, width=1400, log_x=True, text_auto=True, color=lens, orientation='h')
fig.update_yaxes(type='category')

fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
fig.update_layout(
    title="Degree distribution of bus stops",
    xaxis_title="Count",
    yaxis_title="Degree (# of routes passing)",
    # legend_title="Legend Title",
    font=dict(
        size=18,
        # color="RebeccaPurple"
    )
)
# fig.update_traces(textposition='outside')

fig.show(renderer='notebook')
C:\Users\ADMIN\AppData\Local\Temp\ipykernel_25672\1233675535.py:2: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

In [ ]:
bins = [*range(15), 15, 50]
lens_obj = df_nodes.groupby('StopType').apply(
    # # print
    # axis='index',
    lambda g: (g.groupby(pd.cut(df_nodes['Degree'], bins=bins)).apply(len))
    # lambda g: g.value_counts()
).stack().to_dict()

lens_obj

def get_label(label):
    if label.right > 15:
        return '15+'
    return str(label.right)

lens_df = pd.DataFrame([
    {
        'StopType': stop_type,
        'Label': get_label(label),
        'Count': count
    }
    for (stop_type, label), count in lens_obj.items()
])
lens_df

fig = px.bar(
    lens_df, x='Count', y='Label', facet_col='StopType', facet_col_wrap=2,
    height=1350, width=1800, color='Count',
    log_x=True, text_auto=True, orientation='h'
)
fig.update_yaxes(title="Degree (# of routes passing)", type='category')
fig.update_xaxes(title="Count")

fig.update_layout(
    title="Degree distribution",
    # legend_title="Legend Title",
    font=dict(
        size=16,
    )
)

fig.show(renderer='notebook')
C:\Users\ADMIN\AppData\Local\Temp\ipykernel_31452\1622963036.py:5: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

C:\Users\ADMIN\AppData\Local\Temp\ipykernel_31452\1622963036.py:2: DeprecationWarning:

DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.

Relationship between Route distance and running time¶

In [ ]:
df_vars = pd.DataFrame([var.to_dict() for var in vars._objs.values()])

df_vars['RouteVarName'] = df_vars['StartStop'] + ' - ' + df_vars['EndStop']
df_vars['Speed'] = df_vars['Distance'] / df_vars['RunningTime'] * 0.06
df_vars['RouteId'] = df_vars['RouteIds'].apply(lambda ids: ids[0])
df_vars['RouteVarId'] = df_vars['RouteIds'].apply(lambda ids: ids[1])
df_vars = df_vars[['RouteId', 'RouteVarId', 'RouteNo', 'RouteVarName', 'Distance', 'RunningTime', 'Speed']]
df_vars
Out[ ]:
RouteId RouteVarId RouteNo RouteVarName Distance RunningTime Speed
0 3 5 03 Bến xe buýt Sài Gòn - THẠNH LỘC 21456.0 70 18.390857
1 3 6 03 THẠNH LỘC - Bến xe buýt Sài Gòn 21704.0 70 18.603429
2 1 1 01 Công Trường Mê Linh - Bến xe Chợ Lớn 8381.0 35 14.367429
3 1 2 01 Bến xe Chợ Lớn - Công Trường Mê Linh 9458.0 35 16.213714
4 7 13 07 Bến xe Chợ Lớn - BÃI HẬU CẦN SỐ 1 15907.0 70 13.634571
... ... ... ... ... ... ... ...
292 314 2 HS-94 Nguyen Van Huong - TH Nguyen Bỉnh Khiem 11142.0 80 8.356500
293 313 1 HS-93 Thao Dien - TH Nguyen Bỉnh Khiem 12986.0 90 8.657333
294 313 2 HS-93 TH Nguyen Bỉnh Khiem - Thao Dien 7277.0 80 5.457750
295 337 1 SWB1 Ga tàu thuỷ Bạch Đằng - Ga tàu thuỷ Linh Đông 11680.0 52 13.476923
296 337 2 SWB1 Ga tàu thuỷ Linh Đông - Ga tàu thuỷ Bạch Đằng 11785.0 52 13.598077

297 rows × 7 columns

In [ ]:
# px.histogram(filtered_vars_df['Distance'], nbins=100)
fig = px.scatter(
    df_vars, y='Distance', x='RunningTime', marginal_x='box', marginal_y='box', width=1600, height=900, 
    trendline='ols', trendline_options={"add_constant": False},
    hover_data=['RouteVarName', 'RouteId', 'RouteVarId'],
    color='Speed', color_continuous_scale=px.colors.sequential.Agsunset, range_color=[5, 40]
)
fig.update_layout(
    title="Relationship between Distance and RunningTime",
    # legend_title="Legend Title",
    font=dict(
        size=20,
    )
)
fig.show(renderer='notebook')

Running time of construction algorithms¶

In [ ]:
def run_gen(sides_set_type: str):
    BusNetwork.from_ndjsons(sides_set_type=sides_set_type)

NO_RUNS = 5

times = {}
for mode in ['default', 'spatial']:
    print('Running mode', mode)
    times[mode] = [
        timeit.timeit("run_gen(sides_set_type=mode)", globals=globals(), number=1)
        for _ in range(NO_RUNS)
    ]

times
Running mode default
sides_set_type =  default
100%|██████████| 297/297 [00:42<00:00,  6.92it/s]
sides_set_type =  default
100%|██████████| 297/297 [00:42<00:00,  7.00it/s]
sides_set_type =  default
100%|██████████| 297/297 [00:39<00:00,  7.54it/s]
sides_set_type =  default
100%|██████████| 297/297 [00:39<00:00,  7.60it/s]
sides_set_type =  default
100%|██████████| 297/297 [00:38<00:00,  7.71it/s]
Running mode spatial
sides_set_type =  spatial
100%|██████████| 297/297 [00:07<00:00, 38.92it/s]
sides_set_type =  spatial
100%|██████████| 297/297 [00:07<00:00, 38.38it/s]
sides_set_type =  spatial
100%|██████████| 297/297 [00:07<00:00, 38.87it/s] 
sides_set_type =  spatial
100%|██████████| 297/297 [00:07<00:00, 39.59it/s]
sides_set_type =  spatial
100%|██████████| 297/297 [00:07<00:00, 39.40it/s]
Out[ ]:
{'default': [43.38411490013823,
  42.890710399951786,
  39.85574440006167,
  39.51483920007013,
  38.97113410010934],
 'spatial': [8.083511600038037,
  8.161453299922869,
  8.059146600076929,
  7.91556299990043,
  7.955796599853784]}
In [ ]:
df_times = pd.DataFrame(times, index=['Run #{}'.format(x) for x in range(1, 1 + NO_RUNS)])

NO_ITERS = 297
for mode in ['default', 'spatial']:
    df_times[mode + '_it_per_sec'] = NO_ITERS / df_times[mode]

df_times = df_times[['default', 'default_it_per_sec', 'spatial', 'spatial_it_per_sec']]
df_times
Out[ ]:
default default_it_per_sec spatial spatial_it_per_sec
Run #1 43.384115 6.845824 8.083512 36.741458
Run #2 42.890710 6.924576 8.161453 36.390578
Run #3 39.855744 7.451874 8.059147 36.852537
Run #4 39.514839 7.516164 7.915563 37.521020
Run #5 38.971134 7.621025 7.955797 37.331271
In [ ]:
fin = df_times.sum() / len(df_times)
fin
Out[ ]:
default               40.923309
default_it_per_sec     7.271893
spatial                8.035094
spatial_it_per_sec    36.967373
dtype: float64
In [ ]:
fin['default'] / fin['spatial']
Out[ ]:
5.0930714039938225

Shortest paths¶

Shortest path between two arbitary nodes¶

In [ ]:
SRC, DEST = 35, 1234

print(stops[SRC])
print(stops[DEST])
==============[Bến xe buýt Sài Gòn]===============
| StopID:              35                        |
| Code:                BX 01                     |
| Name:                Bến xe buýt Sài Gòn       |
| Type:                Bến xe                    |
| Zone:                Quận 1                    |
| Ward:                Phường Phạm Ngũ Lão       |
| Address No.:         BẾN XE BUÝT SÀI GÒN       |
| Street:              Lê Lai                    |
| Support Disability?: True                      |
| Status:              Đang khai thác            |
| Lng:                 106.689362                |
| Lat:                 10.767676                 |
| Search tokens:       BxbSG, BXBSG, LL          |
| Routes:              03 -> 04 -> 102 -> 109 -> 120 -> 13 -> 140 -> 18 -> 19 -> 20 -> 27 -> 28 -> 34 -> 36 -> 39 -> 52 -> 61-6 -> 65 -> 69 -> 70-3 -> 72 -> 75 -> 86 -> 88 -> 93 -> D1
==================================================
==================[Ngã 3 Củ Cải]==================
| StopID:              1234                      |
| Code:                HHM 053                   |
| Name:                Ngã 3 Củ Cải              |
| Type:                Nhà chờ                   |
| Zone:                Huyện Hóc Môn             |
| Ward:                Xã Xuân Thới Đông         |
| Address No.:         43/1 (kế 33/5A), 7/2A     |
| Street:              Quốc lộ 22                |
| Support Disability?: True                      |
| Status:              Đang khai thác            |
| Lng:                 106.603324                |
| Lat:                 10.860602                 |
| Search tokens:       N3CC, 43/1(33/5A),7/2A, Ql22
| Routes:              122 -> 13 -> 62-5 -> 70-3 -> 74 -> 85 -> 94
==================================================
In [ ]:
dijkstra.shortest_path_to_json(net=net, src=SRC, dest=DEST, file='out.json')

with open('out.json', 'r', encoding='utf-8') as f:
    obj = json.load(f)
    print('[Shortest path from {} to {}]'.format(obj["Src"], obj["Dest"]))
    print("{} -> {}, time = {}, length = {}".format(obj["Src"], obj["Dest"], obj["PathTime"], obj["PathLength"]))
[Shortest path from 35 to 1234]
35 -> 1234, time = 54.631871747758225, length = 26006.492199032677
In [ ]:
geojson_data = {
    "type": "FeatureCollection",
    "features": [
        {
            "type": "Feature",
            "properties": {},
            "geometry": {
                "type": "LineString",
                "coordinates": [
                    list(reversed(vn2000_to_wgs84(lng, lat)))
                    for [lng, lat] in sum((edge["Path"] for edge in obj["Path"]), start=[])
                ]
            }
        }
    ]
}
In [ ]:
with open('stops.geojson', 'w', encoding='utf-8') as f:
    f.write(json.dumps(geojson_data, indent=4))
In [ ]:
m = Map(center=(10.82, 106.65), zoom=12)
m.add(GeoJSON(
    data = geojson_data
))
m
Out[ ]:
Map(center=[10.82, 106.65], controls=(ZoomControl(options=['position', 'zoom_in_text', 'zoom_in_title', 'zoom_…

No description has been provided for this image

Shortest paths between all pairs of nodes¶

In [ ]:
dijkstra.all_shortest_path_to_json(net=net, file='out.json')
# Do not run directly! The generated file could be up to GBs.
Out[ ]:
<generator object BusNetworkDijkstra.all_shortest_path_to_json at 0x000001FF34643DE0>

Top $k$ most influential Stops¶

In [ ]:
net_analysis.from_net(net=net, dijkstra_engine=dijkstra, alg='tree')
100%|██████████| 4397/4397 [02:26<00:00, 29.94it/s]
In [ ]:
def print_top(k = 10):
    print('[Top k = {} most influential Stops]'.format(k))
    top_stops_id = net_analysis.top_scores(10)
    for stop_id in top_stops_id:
        print(stop_id, net_analysis.scores[stop_id])

print_top()
[Top k = 10 most influential Stops]
268 2625614
267 2625614
1239 2614406
1115 2613916
1393 2607219
1152 2604321
270 2311589
271 2310622
174 2294866
510 2261665
In [ ]:
im_stops_df = pd.DataFrame(
    {
        **stops[stop_id].to_dict(),
        "Score": net_analysis.scores[stop_id]
    }
    for stop_id in net_analysis.top_scores(4397)
)
im_stops_df = im_stops_df[['StopId', 'Code', 'Name', 'StopType', 'Zone', 'Street', 'Lng', 'Lat', 'Score']].sort_values(by='Score')
In [ ]:
fig = px.scatter_geo(
    im_stops_df, lat='Lat', lon='Lng', color='Score', width=1400, height=1000, color_continuous_scale='rdpu', range_color=[0, 2000000],
    hover_data=['StopId', 'Code', 'Name', 'Zone', 'Score']
)
fig.update_layout(
    title='Spatial distribution of Betweenness score',
    font=dict(
        size=17
    ),
    geo=dict(
        scope = 'asia',
        resolution = 110,
        lonaxis_range= [106.4, 106.9],
        lataxis_range= [10.6, 11],
        landcolor = 'rgb(240, 240, 240)',
    )
)
fig.show(renderer='notebook')

Table of Top $k=10$ Stops¶

In [ ]:
stop_list = [
    {
        **stops[stop_id].to_dict(),
        "Score": net_analysis.scores[stop_id]
    }
    for stop_id in net_analysis.top_scores()
]

df = pd.DataFrame(stop_list, index=pd.RangeIndex(1, 11))
df['Routes'] = df['Routes'].apply(lambda routes: len(routes.split()))
df['Location'] = df['Street'] + ', ' + df['Zone']

df = df[['StopId', 'Name', 'Location', 'Street', 'Zone', 'StopType', 'Score']]

df
Out[ ]:
StopId Name Location Street Zone StopType Score
1 268 Mũi tàu Cộng Hòa Trường Chinh, Quận Tân Bình Trường Chinh Quận Tân Bình Trụ dừng 2625614
2 267 Ngã ba Chế Lan Viên Trường Chinh, Quận Tân Bình Trường Chinh Quận Tân Bình Nhà chờ 2625614
3 1239 Bến xe An Sương Quốc lộ 22, Huyện Hóc Môn Quốc lộ 22 Huyện Hóc Môn Trụ dừng 2614406
4 1115 Bến xe An Sương Quốc lộ 22, Quận 12 Quốc lộ 22 Quận 12 Trụ dừng 2613916
5 1393 Ngã tư Trung Chánh Quốc lộ 22, Huyện Hóc Môn Quốc lộ 22 Huyện Hóc Môn Nhà chờ 2607219
6 1152 Trung tâm Văn hóa Quận 12 Quốc lộ 22, Quận 12 Quốc lộ 22 Quận 12 Nhà chờ 2604321
7 270 UBND Phường 15 Trường Chinh, Quận Tân Bình Trường Chinh Quận Tân Bình Nhà chờ 2311589
8 271 Khu Công Nghiệp Tân Bình Trường Chinh, Quận Tân Bình Trường Chinh Quận Tân Bình Nhà chờ 2310622
9 174 Trạm Dệt Thành Công Trường Chinh, Quận Tân Phú Trường Chinh Quận Tân Phú Nhà chờ 2294866
10 510 Bệnh viện Quận Tân Bình Hoàng Văn Thụ, Quận Tân Bình Hoàng Văn Thụ Quận Tân Bình Nhà chờ 2261665

# of edge candidates $|E|$ per Stop¶

In [ ]:
def get_df(mode):
    return pd.DataFrame(
        {
            "RouteId": route_id,
            "RouteVarId": route_var_id,
            "PathLength": path_len,
            **pd.Series(per_stop).describe()
        }
        for (route_id, route_var_id), path_len, per_stop in BusNetwork._analyse_sides_set(sides_set_type=mode)
    )
In [ ]:
df_spatial = get_df('spatial')
df_spatial['Type'] = 'spatial'

df_default = get_df('default')
df_default['Type'] = 'default'
sides_set_type =  spatial
100%|██████████| 297/297 [00:06<00:00, 49.05it/s] 
sides_set_type =  default
100%|██████████| 297/297 [00:00<00:00, 603.26it/s]
In [ ]:
df_all = pd.concat([df_default, df_spatial]).reset_index().drop(columns='index')
df_all
Out[ ]:
RouteId RouteVarId PathLength count mean std min 25% 50% 75% max Type
0 3 5 483 53.0 483.0 0.000000 483.0 483.00 483.0 483.00 483.0 default
1 7 14 222 50.0 222.0 0.000000 222.0 222.00 222.0 222.00 222.0 default
2 1 1 249 29.0 249.0 0.000000 249.0 249.00 249.0 249.00 249.0 default
3 3 6 327 56.0 327.0 0.000000 327.0 327.00 327.0 327.00 327.0 default
4 7 13 235 44.0 235.0 0.000000 235.0 235.00 235.0 235.00 235.0 default
... ... ... ... ... ... ... ... ... ... ... ... ...
589 313 1 99 2.0 2.5 0.707107 2.0 2.25 2.5 2.75 3.0 spatial
590 314 2 73 2.0 1.5 0.707107 1.0 1.25 1.5 1.75 2.0 spatial
591 313 2 30 2.0 1.5 0.707107 1.0 1.25 1.5 1.75 2.0 spatial
592 337 1 54 5.0 6.8 3.898718 3.0 4.00 5.0 11.00 11.0 spatial
593 337 2 52 5.0 7.2 3.271085 4.0 4.00 7.0 10.00 11.0 spatial

594 rows × 12 columns

In [ ]:
df_default['mean'].describe()
Out[ ]:
count    297.000000
mean     247.265993
std      155.000806
min        6.000000
25%      154.000000
50%      240.000000
75%      333.000000
max      781.000000
Name: mean, dtype: float64
In [ ]:
df_spatial['mean'].describe()
Out[ ]:
count    297.000000
mean       6.320350
std        2.569525
min        1.000000
25%        4.650000
50%        6.275000
75%        7.660000
max       14.040000
Name: mean, dtype: float64
In [ ]:
fig = px.scatter(
    df_all, x='PathLength', y='count', marginal_x='box', marginal_y='box', width=1600, height=900, 
    trendline='ols', trendline_options={"add_constant": False},
    hover_data=['RouteId', 'RouteVarId', 'count', 'mean'],
    color='Type'
)
fig.show(renderer='notebook')
In [ ]:
fig = px.scatter(
    df_all, x='PathLength', y='mean', marginal_x='box', marginal_y='box', log_y=True, width=1600, height=900, 
    trendline='ols', trendline_options={"add_constant": True},
    hover_data=['RouteId', 'RouteVarId', 'count', 'mean'],
    color='Type'
)

fig.update_xaxes(title="# of Path sides")
fig.update_yaxes(title="# of candidate edges |E|")

fig.update_layout(
    font=dict(
        size=20,
    )
)
fig.show(renderer='notebook')

Running time between Betweenness Analysis algorithms¶

In [ ]:
def run_analysis(alg: str):
    _net_analysis = NetworkAnalysisBetweenness()
    _net_analysis.from_net(net=net, dijkstra_engine=None, alg=alg)

NO_RUNS = 5

times = {}
for mode in ['default', 'tree']:
    print('Running mode', mode)
    times[mode] = [
        timeit.timeit("run_analysis(alg=mode)", globals=globals(), number=1)
        for _ in range(NO_RUNS)
    ]

times
Running mode default
100%|██████████| 4397/4397 [09:28<00:00,  7.74it/s]
100%|██████████| 4397/4397 [08:54<00:00,  8.22it/s]
100%|██████████| 4397/4397 [07:40<00:00,  9.55it/s]
100%|██████████| 4397/4397 [07:30<00:00,  9.76it/s]
100%|██████████| 4397/4397 [07:16<00:00, 10.06it/s]
Running mode tree
100%|██████████| 4397/4397 [02:28<00:00, 29.56it/s]
100%|██████████| 4397/4397 [02:21<00:00, 31.05it/s]
100%|██████████| 4397/4397 [02:23<00:00, 30.59it/s]
100%|██████████| 4397/4397 [02:17<00:00, 32.01it/s]
100%|██████████| 4397/4397 [02:17<00:00, 31.92it/s]
Out[ ]:
{'default': [568.2074843998998,
  534.7812418001704,
  460.3806853001006,
  450.74478500010446,
  436.9792862001341],
 'tree': [148.74347339989617,
  141.63267030008137,
  143.76318740006536,
  137.3843427998945,
  137.74046280002221]}
In [ ]:
df_times = pd.DataFrame(times, index=['Run #{}'.format(x) for x in range(1, 1 + NO_RUNS)])

NO_ITERS = 4397
for mode in ['default', 'tree']:
    df_times[mode + '_it_per_sec'] = NO_ITERS / df_times[mode]

df_times = df_times[['default', 'default_it_per_sec', 'tree', 'tree_it_per_sec']]
df_times
Out[ ]:
default default_it_per_sec tree tree_it_per_sec
Run #1 568.207484 7.738370 148.743473 29.560961
Run #2 534.781242 8.222054 141.632670 31.045097
Run #3 460.380685 9.550792 143.763187 30.585020
Run #4 450.744785 9.754966 137.384343 32.005103
Run #5 436.979286 10.062262 137.740463 31.922355
In [ ]:
fin = df_times.sum() / len(df_times)
fin
Out[ ]:
default               490.218697
default_it_per_sec      9.065689
tree                  141.852827
tree_it_per_sec        31.023707
dtype: float64
In [ ]:
print('Improvement: {}%'.format((fin['default'] / fin['tree'] * 100).round(2)))
Improvement: 345.58%